#!/bin/bash

# Copyright (c) Los Alamos National Security, LLC, and others.

# Load the shared test environment. DATADIR and the x / y helpers used below
# are not defined in this file; presumably they come from here -- confirm.
. ./environment.sh

# Work inside the test data directory. The expansion is quoted so a path
# containing spaces is not word-split, and ${DATADIR:?} aborts when the
# variable is unset or empty: a bare "cd" would silently switch to $HOME and
# the tests below would then create their files and directories there.
cd -- "${DATADIR:?DATADIR is not set; check environment.sh}" || exit 1

# Do not stop on a non-zero exit status; the commands below are run one after
# another and their statuses are printed instead.
set +e
exec 2>&1  # send stderr to stdout to make output more readable


# Check argument parsing by calling hashsplit with a variety of argument
# lists. x and y are helpers that are not defined in this file (presumably
# provided by environment.sh -- confirm there for their exact behavior).
# Each "echo $?" prints the exit status left by the x call just before it.

# No arguments at all.
x hashsplit
echo $?
# A single argument.
x hashsplit 2
echo $?
# A single, non-numeric argument.
x hashsplit foo
echo $?
# Two arguments, the first of which is zero.
x hashsplit 0 foo
echo $?
# Empty-string arguments. The whole command is handed to y as one quoted
# string so that the '' arguments and the "|| echo 1" fallback stay part of
# the command; presumably y evaluates the string with a shell, in which case
# "1" is printed when hashsplit exits non-zero -- confirm.
y "hashsplit '' foo || echo 1"
y "hashsplit 2 '' || echo 1"

# Check directory and file creation. Standard input is /dev/null, so
# hashsplit is given no input lines; ls -R then lists whatever exists under a.
y "hashsplit 2 a < /dev/null"
x ls -R a

# Check failure when the directory can't be created because its parent
# doesn't exist.
y "hashsplit 2 /does/not/exist < /dev/null || echo 1"

# Check failure on file where directory should be: thisisafile is created as
# a regular file and then used as the parent of the output path.
# NOTE(review): unlike the previous check, this one has no "|| echo 1" --
# confirm that is intentional.
x touch thisisafile
y "hashsplit 2 thisisafile/out < /dev/null"

# Check directory already existing does not fail: b is created up front and
# then passed as the output location.
x mkdir b
y "hashsplit 2 b < /dev/null"
x ls -R b

# Check that it works (note that we repeat "a" several times to make sure
# we're hashing just the key and not the whole line), including with UTF-8.
#
# The fixture below is written to in.txt. Most lines are a key and a value
# separated by a tab; the last three are a key followed by a tab and nothing
# else, a line with no tab at all, and a line of Japanese text with no tab.
cat > in.txt <<EOF
a	1
b	2
c	3
a	4
b	5
a	6
a	7
a	8
a	9
a	10
a	11
a	12
a	13
a	14
a	15
a	16
a	17
a	18
a	19
a	20
a	21
nullvaluewithtab	
nullvaluenotab
私の名前は中野です
EOF
# Show the input, split it with degree 2 into out, then show out/0 and out/1.
x cat in.txt
y "hashsplit 2 out < in.txt"
x cat out/0
x cat out/1

# Now split with much wider degree. This is mainly used to test that other
# hash functions are consistent. The value 240 is from the default open files
# limit of 256 (ulimit -n) on OS X Mountain Lion.
#
# Note that both "nullvaluenotab" and the Japanese text hash to 19 mod 240. I
# am pretty well convinced this is a coincidence. I ran /usr/share/dict/words
# through hashsplit and ended up with approximately 17% maximum variation in
# the size of 100 buckets, which seems reasonable. The actual hash values of
# the two strings differ.
#
# NOTE(review): the paragraph above mentions bucket 19, but the commands below
# only display out/37, out/145 and out/5 -- confirm whether the comment or the
# list of displayed buckets is out of date.
#
# This split writes to the same "out" path that the degree-2 split above used.
y "hashsplit 240 out < in.txt"
x cat out/37
x cat out/145
x cat out/5
